###############################################################
###############################################################
#### Stefan Müller, Samuel Brazys, and Alexander Dukalskis
#### Replication Scripts for: 
#### Discourse Wars and 'Mask Diplomacy': China's Global Image Management in Times of Crisis 
#### Political Research Exchange, 2024
#### Link to paper: https://doi.org/10.1080/2474736X.2024.2309178
###############################################################
###############################################################

## Note: check the 000_README.pdf file on Harvard Dataverse for 
## the full replication instructions and information on all code scripts.
## Link to Dataverse repository: https://doi.org/10.7910/DVN/KRXMXJ
## Please contact the authors if you have any questions or suggestions. 
## Note: due to copyright restrictions some of the files cannot be shared publicly.
## However, we provide all replication scripts and intermediate objects to reproduce
## the plots and tables included in the paper and Supporting Information.

## This file performs the analysis of assessing if terms related to the White Paper
## are mentioned more often after receiving mask diplomacy support.

library(quanteda)
library(dplyr)
library(stringr)
library(lubridate)
library(countrycode)
library(ggplot2)
library(forcats)
library(quanteda.textstats)
library(quanteda.textplots)
library(MASS)
library(stringr)
library(ggeffects)
library(modelsummary)
library(tidyr)

# load custom ggplot2 scheme
source("function_theme_base.R")

dat_classified_all <- readRDS("data_dontshare/data_corpus_covid_full.rds")

# check how floor_date works
floor_date(as.Date("2020-01-05"), unit = "weeks",
           week_start = 1)

floor_date(as.Date("2020-01-06"), unit = "weeks",
           week_start = 1)


week(as.Date("2020-01-06"))


## compare mask diplomacy data
dat_treatments_all <- read.csv("data_maskdiplomacy.csv")

dat_treatments_no_rmc <- read.csv("data_maskdiplomacy_no_rmc.csv") %>% 
  rename(min_week_t_normc = min_week_t)

dat_treatments_merged <- left_join(dat_treatments_all,
                                   dat_treatments_no_rmc)

dat_treatments_merged <- dat_treatments_merged %>% 
  mutate(week_same = ifelse(min_week_t == min_week_t_normc, 1, 0)) %>% 
  mutate(weeks_diff = min_week_t - min_week_t_normc)

table(dat_treatments_merged$week_same)
table(dat_treatments_merged$weeks_diff)

# recode the weeks
dat_treatments <- dat_treatments_all %>% 
  mutate(week = ifelse(min_week_t == 99999, NA, min_week_t)) %>% 
  mutate(week_pre_mask = week - 1,
         week_post_mask = week + 1) %>% 
  rename(domain_country_merged = country)

dat_merged <- left_join(dat_classified_all,
                        dat_treatments, by = "domain_country_merged")

dat_merged$week_article <- isoweek(dat_merged$date)

dat_treated <- dat_merged %>% 
  mutate(treatment_group_3 = ifelse(
    week_article == week, "Week of Mask Diplomacy",
    ifelse(week_article == week_post_mask, "Week After Mask Diplomacy",
           ifelse(week_article == week_pre_mask, 
                  "Week Before Mask Diplomacy", NA))
  )) %>% 
  filter(language == "en") %>% # only English articles
  group_by(domain_country_merged) %>% 
  filter(!is.na(treatment_group_3)) %>% 
  mutate(n_articles_relevant = n()) %>% 
  filter(n_articles_relevant > 30) %>% 
  filter(!domain_country_merged %in% c("Georgia", "Croatia", "Liberia"))  # remove countries with few observations

# get number of articles per week
dat_treated <- dat_treated %>% 
  mutate(week = as.factor(as.character(isoweek(date)))) %>% 
  group_by(country, week) %>% 
  mutate(n_articles_week = n()) %>% 
  ungroup() %>% 
  mutate(docname = paste(country, treatment_group_3, n_articles_week, date, documentidentifier, sep = "__"))


table(dat_treated$treatment_group_3)

nrow(dat_treated)

# create text corpus
corp_treatment <- corpus(dat_treated)

docnames(corp_treatment) <- dat_treated$docname

ndoc(corp_treatment)

# overview of articles by country in relevant time window
table(corp_treatment$country)

# tokenize text corpus
toks_treated <- tokens(corp_treatment,
                       remove_punct = TRUE,
                       remove_numbers = TRUE,
                       remove_symbols = TRUE,
                       remove_url = TRUE)


# find collocations
# set.seed(235)
# tstat_col <- toks_treated %>% 
#   tokens_sample(size = 500) %>% 
#   tokens_remove(pattern = stopwords("en"),
#                 padding = TRUE) %>% 
#   textstat_collocations(min_count = 3, size = 2:3)
# 
# head(tstat_col)

# compound important multiword expressions

# head(tstat_col$collocation, 100)

compound_phrases <- phrase(c("public health",
                     "world health org*",
                     "hubei province",
                     "united states",
                     "confirmed cases",
                     "local businesses",
                     "tested postive",
                     "death toll",
                     "south korea",
                     "health emergenc*",
                     "chief medical",
                     "social media",
                     "last week",
                     "social distanc*",
                     "disease control",
                     "human right*",
                     "per cent*",
                     "health authorit*",
                     "travel* ban*",
                     "travel restrictions",
                     "health official*",
                     "donald trump",
                     "new cases",
                     #"health organi*",
                     "middle east",
                     "prime minister*",
                     "wall street"))


# loop through countries

countries <- unique(toks_treated$domain_country_merged)

length(countries)
# 56 countries

# load list of keywords and transform to dictionary
dat_dict <- rio::import("data_dictionary_china_positive.xlsx")

# remove specific words
dat_dict <- filter(dat_dict, !word %in% c("people"))

dat_dict$sentiment <- paste("f", 
                            str_to_lower(dat_dict$category_main), 
                            dat_dict$word, sep = "_")
dat_dict$sentiment <- str_replace_all(dat_dict$sentiment, " ", "_")


dat_dict$word <- str_replace_all(dat_dict$word, "_", " ")
dat_dict$word <- paste0("^", dat_dict$word)


dict <- as.dictionary(dat_dict)

# look up terms
dict


# empty data frames to store results
tstat_key_all <- data.frame()

dict_all <- data.frame()

for (i in countries) {
  
  cat("Analysing", i, "\n")
 
  toks_country  <- toks_treated %>%    
    tokens_subset(domain_country_merged == i) 
  
  toks_country_before <- tokens_subset(toks_country,
                                       treatment_group_3 != "Week of Mask Diplomacy")
  toks_country_treatment <- tokens_subset(toks_country, 
                                       treatment_group_3 == "Week of Mask Diplomacy")
  
  dfmat_dict <- toks_country %>% 
    tokens_lookup(dictionary = dict, valuetype = "regex") %>% # look up as regular expressions
    dfm() %>% 
    quanteda::convert(to = "data.frame")
    
  dfmat_docvars <- docvars(toks_country)
  
  dfmat_dict <- bind_cols(dfmat_docvars, dfmat_dict)
  
  dict_all <- bind_rows(dfmat_dict, dict_all)
  
}


# convert to long format for regression a nalysis
dat_analysis_long <- dict_all %>% 
  dplyr::select(c(starts_with("f_"), country, week, treatment_group_3,
                  documentidentifier)) %>% 
  gather(word, sum_word, -c(country, treatment_group_3, 
                            week,
                            documentidentifier)) %>% 
  group_by(word) %>% 
  mutate(count_word = sum(sum_word)) %>% 
  filter(count_word > 500) %>% # only keep words appearing at least 500 times
  ungroup() %>% 
  group_by(word, country) %>% 
  mutate(sum_word_stand = (sum_word - mean(sum_word, na.rm = TRUE)) / sd(sum_word, na.rm = TRUE))


dat_analysis_long$week <- factor(dat_analysis_long$week)
  
summary(dat_analysis_long$sum_word_stand)
summary(dat_analysis_long$sum_word)

cor.test(dat_analysis_long$sum_word_stand,
         dat_analysis_long$sum_word)


head(words)

# save dataset
saveRDS(dat_analysis_long, "data_words_count.rds")

# load data
# dat_analysis_long <- readRDS("data_words_count.rds")


# recode the word variable
dat_analysis_long <- dat_analysis_long %>% 
  mutate(word = str_remove_all(word, "f_media_"),
         word = str_remove_all(word, "f_other_"),
         word = str_remove_all(word, "f_whitepaper_"))

# get selected words
words <-  c("prevention",
            "support", 
            "coordination",
            "commitment",
            "fight",
            "help",
            "solidarity",
            "assistance")

# data frame for predicted values and incidence ratios
dat_predict_words <- data.frame()
dat_predict_words_exp_all <- data.frame()

#dat_predict_words_lm <- data.frame()
# data frame for coefficients
dat_words_coefs <- data.frame()

table(dat_analysis_long$treatment_group_3)

dat_analysis_long$treatment_group_3 <- factor(dat_analysis_long$treatment_group_3,
                                              levels = c("Week Before Mask Diplomacy",
                                                         "Week of Mask Diplomacy",
                                                         "Week After Mask Diplomacy"))

# run regressions in a loop for each term

for (i in words) {
  
  cat("Assess", i, "\n")
  
  dat_word <- filter(dat_analysis_long, word == i)
  
  glm_word <- glm.nb(sum_word ~ treatment_group_3 + country, 
                           data = dat_word)
  
  # https://stats.oarc.ucla.edu/r/dae/negative-binomial-regression/
  # get incidence ratios
  dat_predict_words_exp <- cbind(
    estimate = exp(coef(glm_word)), 
    exp(confint.default(glm_word, level = 0.95)), 
    exp(confint.default(glm_word, level = 0.9)))
  
  dat_predict_words_exp <- as.data.frame(dat_predict_words_exp)
  dat_predict_words_exp$coefficient <- rownames(dat_predict_words_exp)
  
  dat_predict_words_exp$word <- i
  
  # remove country coefficients and intercept from data frame
  dat_predict_words_exp <- dat_predict_words_exp %>% 
    filter(!str_detect(coefficient, "country")) %>% 
    filter(!str_detect(coefficient, "Intercept"))
  
  dat_predict_95 <- ggpredict(glm_word, terms = "treatment_group_3",
                           ci.lvl = 0.95) %>% 
    as.data.frame() %>% 
    rename(ci_lower_95 = conf.low,
           ci_higher_95 = conf.high)
  
  dat_predict_90 <- ggpredict(glm_word, terms = "treatment_group_3",
                              ci.lvl = 0.90) %>%
    as.data.frame() %>%
    rename(ci_lower_90 = conf.low,
           ci_higher_90 = conf.high)

  dat_predict <- left_join(dat_predict_90, dat_predict_95,
                           by = c("x", "predicted", "std.error", "group"))
  # 
  dat_predict$word <- i
  
  #dat_predict_words_lm <- bind_rows(dat_predict_95_lm, dat_predict
  
  dat_predict_words_exp_all <- bind_rows(dat_predict_words_exp_all,
                                        dat_predict_words_exp)
  dat_predict_words <- bind_rows(dat_predict_words, dat_predict)
  
}

# save objects to reproduce Figure 8 
save(dat_predict_words, dat_predict_words_exp_all,
     dat_analysis_long, 
     file = "data_fig_08.RData")

# load objects for reproducibility
load("data_fig_08.RData")

# set levels of factor
levels_treatment <- c("Week Before Mask Diplomacy", 
                      "Mask Diplomacy Week", 
                      "Week After Mask Diplomacy")


dat_predict_words_clean <- dat_predict_words %>% 
  mutate(treatment = ifelse(str_detect(x, "Before"), "Week Before Mask Diplomacy",
                            ifelse(str_detect(x, "After"), "Week After Mask Diplomacy", "Mask Diplomacy Week"))) %>% 
  mutate(treatment = factor(treatment, levels = levels_treatment))

# clean data frame with incidence ratios

dat_predict_words_exp_all_clean <- dat_predict_words_exp_all %>% 
  mutate(treatment = ifelse(str_detect(coefficient, "Before"), "Week Before Mask Diplomacy",
                            ifelse(str_detect(coefficient, "After"), "Week After Mask Diplomacy", "Mask Diplomacy Week"))) %>% 
  mutate(treatment = factor(treatment, levels = levels_treatment)) %>% 
  filter(word %in% 
           c("prevention",
             "support", 
             "coordination",
             "commitment",
             "fight",
             "help",
             "solidarity",
             "assistance"))


# Figure 08 
ggplot(filter(dat_predict_words_exp_all_clean,  treatment == "Week After Mask Diplomacy"),
       aes(x = reorder(word, estimate), y = estimate)) +
  geom_point(size = 3) +
  geom_linerange(aes(ymin = `2.5 %`,
                     ymax = `97.5 %`),
                 size = 0.5) +
  geom_linerange(aes(ymin = `5 %`,
                     ymax = `95 %`),
                 size = 1.3) +
  coord_flip() +
  geom_hline(yintercept = 1, linetype = "dashed") +
  scale_y_continuous(limits = c(c(0.9, 2.8)),
                     breaks = c(seq(1, 2.8, 0.2))) +
  labs(x = NULL, 
       y = "Incidence Rate Ratios: Increase in Word Frequencies in the Week After Receiving Mask Diplomacy \n(Relative to Week Before Mask Diplomacy)")
ggsave("fig_08.pdf",
       width = 9, height = 4)
ggsave("fig_08.png", 
       width = 9, height = 4, dpi = 300)
ggsave("fig_08.eps", 
       width = 9, height = 4, device = "eps")

# separate regression models for terms of interest

glm_solidarity <- glm.nb(sum_word ~ treatment_group_3 + country, 
                   data = filter(dat_analysis_long, word == "solidarity"))


glm_commitment <- glm.nb(sum_word ~ treatment_group_3 + country, 
                         data = filter(dat_analysis_long, word == "commitment"))


glm_coordination <- glm.nb(sum_word ~ treatment_group_3 + country, 
                          data = filter(dat_analysis_long, word == "coordination"))


glm_assisstance <- glm.nb(sum_word ~ treatment_group_3 + country, 
                           data = filter(dat_analysis_long, word == "assistance"))


glm_help <- glm.nb(sum_word ~ treatment_group_3 + country, 
                          data = filter(dat_analysis_long, word == "help"))


glm_fight <- glm.nb(sum_word ~ treatment_group_3 + country, 
                   data = filter(dat_analysis_long, word == "fight"))

glm_support <- glm.nb(sum_word ~ treatment_group_3 + country, 
                    data = filter(dat_analysis_long, word == "support"))

glm_prevention <- glm.nb(sum_word ~ treatment_group_3 + country, 
                      data = filter(dat_analysis_long, word == "prevention"))

# create regression table

models_main <- list()
models_main[['Solidarity']] <- glm_solidarity
models_main[['Commitment']] <- glm_commitment
models_main[['Coordination']] <- glm_coordination
models_main[['Assistance']] <- glm_assisstance
models_main[['Help']] <- glm_help
models_main[['Fight']] <- glm_fight
models_main[['Support']] <- glm_support
models_main[['Prevention']] <- glm_prevention

# modelsummary(models_main)

coefnames <- c("(Intercept)" = "(Intercept)",
               "treatment_group_3Week of Mask Diplomacy" = "Mask Diplomacy Week",
               "treatment_group_3Week After Mask Diplomacy" = "Week After Mask Diplomacy")


# Table A06
modelsummary(models_main,
             statistic = "std.error",
             fmt = "%.3f",
             stars = c('*' = .05, '**' = .01, '***' = .001),
             coef_omit = "(country*)",
             coef_rename = coefnames,
             output = "tab_a06.docx")
